// @author HuangYeWuDeng
// @date 2018-07-17
// @description 开发者头条 toutiao.io 收藏导出

package main

import (
	"fmt"
	"log"
	"net/http"
	"strings"

	"strconv"
	"time"

	"github.com/astaxie/beego/logs"
	"github.com/knq/ini"
	"github.com/pkg/errors"
	"github.com/PuerkitoBio/goquery"
	"github.com/globalsign/mgo/bson"
)

// DefUserAgent is the User-Agent header value used when the user_agent key of
// the [toutiao] config section is empty.
var DefUserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36"

// Process-wide state. Everything here is assigned once in init and read by
// the scraping functions afterwards.
var (
	// blog is the console logger shared by the whole program.
	blog *logs.BeeLogger
	// db is the MongoDB handle; it stays nil when ttStarOnly is true.
	db *StarDb

	// ttStarOnly disables the MongoDB connection and all inserts, so
	// favorites are only printed.
	ttStarOnly bool
	// ghCollection holds the "collection" key of the [mongo] config section.
	ghCollection string
	// ttUser is declared but not referenced by the code in this file section.
	ttUser string
	// ttCookie is the Cookie header value sent with every request.
	ttCookie string
	// ttUserAgent is the User-Agent header value sent with every request.
	ttUserAgent string
	// ttStartPage is the first favorites page to fetch.
	ttStartPage int
	// ttPerPage is handed to getOnePage and shown in the progress log.
	ttPerPage int
	// ttSleep is the pause between two page fetches, in milliseconds.
	ttSleep int
)

// init sets up the console logger, reads config.ini and, unless star_only is
// enabled, opens the MongoDB connection.
//
// Keys read from the [toutiao] section:
//   - cookie:     Cookie header value sent with every request
//   - user_agent: User-Agent header value, DefUserAgent when empty
//   - star_only:  boolean; when true nothing is written to MongoDB
//   - start_page: first page to fetch, 1 when missing, invalid or <= 0
//   - per_page:   100 when missing, invalid, <= 0 or > 200
//   - sleep:      pause between pages in ms, 500 when missing, invalid or < 100
//
// Keys read from the [mongo] section: user, passwd, host, port, db_name,
// collection.
//
// The process exits with a fatal log message when config.ini cannot be loaded.
func init() {
	const configFile = "config.ini"

	// init logger
	blog = logs.NewLogger(1024)
	blog.SetLogger("console", "")
	//blog.EnableFuncCallDepth(true)

	f, err := ini.LoadFile(configFile)
	if err != nil {
		// Every statement below reads from f, so there is no way to carry on
		// without it. Exit with the underlying error rather than continuing
		// with an unusable file handle.
		log.Fatalf("can not load config file %s: %v", configFile, err)
	}

	g := f.GetSection("toutiao")
	ttCookie = g.Get("cookie")
	ttUserAgent = g.Get("user_agent")
	if ttUserAgent == "" {
		ttUserAgent = DefUserAgent
	}

	// A missing star_only keeps the default (false). A value that is present
	// but not a valid boolean is reported instead of being silently dropped.
	if raw := g.Get("star_only"); raw != "" {
		starOnly, perr := strconv.ParseBool(raw)
		if perr != nil {
			blog.Notice("toutiao: invalid star_only value %q, using false\n", raw)
		}
		ttStarOnly = starOnly
	}

	ttStartPage, err = strconv.Atoi(g.Get("start_page"))
	if err != nil || ttStartPage <= 0 {
		ttStartPage = 1
	}
	ttPerPage, err = strconv.Atoi(g.Get("per_page"))
	// Zero and negative sizes are rejected too, matching the start_page check.
	if err != nil || ttPerPage <= 0 || ttPerPage > 200 {
		ttPerPage = 100
	}
	ttSleep, err = strconv.Atoi(g.Get("sleep"))
	if err != nil || ttSleep < 100 {
		ttSleep = 500
	}

	//init db
	s := f.GetSection("mongo")
	user := s.Get("user")
	passwd := s.Get("passwd")
	host := s.Get("host")
	port := s.Get("port")
	dbname := s.Get("db_name")
	ghCollection = s.Get("collection")
	// The password is deliberately left out of the log line.
	blog.Alert("mongo: get mongo config: user: %s, host: %s, port: %s\n", user, host, port)
	if !ttStarOnly {
		blog.Alert("mongo: save data to : %s.%s \n", dbname, ghCollection)
		db = New(user, passwd, host, port, dbname, "authSource=admin")
		blog.Alert("mongo: mongodb connection success")
	}
}

// httpClose closes the body of response. It is safe to call with a nil
// response or with a response whose Body is nil; both are ignored. The error
// returned by Close is discarded.
func httpClose(response *http.Response) {
	if response == nil || response.Body == nil {
		return
	}
	response.Body.Close()
}

// httpRequest performs a body-less HTTP request and returns the raw response.
//
// method and url are passed to http.NewRequest. cookie, when non-empty, is
// sent as the Cookie header. The User-Agent header is always set to
// ttUserAgent.
//
// Redirects are not followed: a 3xx answer is handed back as-is so the caller
// can read its Location header. The whole exchange is limited to 300 seconds.
// The caller is responsible for closing the response body.
func httpRequest(method, url, cookie string) (*http.Response, error) {
	req, err := http.NewRequest(method, url, strings.NewReader(""))
	if err != nil {
		return nil, err
	}
	req.ContentLength = 0
	req.Header.Set("User-Agent", ttUserAgent)
	if cookie != "" {
		req.Header.Set("Cookie", cookie)
	}

	// Returning ErrUseLastResponse makes the client stop at the first
	// redirect and return that response instead of following it.
	// see https://jonathanmh.com/tracing-preventing-http-redirects-golang/
	stopAtRedirect := func(*http.Request, []*http.Request) error {
		return http.ErrUseLastResponse
	}
	httpClient := &http.Client{
		CheckRedirect: stopAtRedirect,
		Timeout:       300 * time.Second,
	}
	return httpClient.Do(req)
}

// getOnePage downloads one page of the toutiao.io favorites list, prints every
// favorite found on it and, unless ttStarOnly is set, stores each one through
// db.
//
// cookie is sent with every request. perPage is kept for signature
// compatibility but is not used: the favorites URL only carries the page
// number. page is the page number to fetch.
//
// It returns an error when a request fails, the page does not answer with
// status 200, the HTML cannot be parsed, the page holds no favorites (the
// caller relies on this to stop paging), or a record cannot be stored.
// Favorites without a link are logged and skipped.
func getOnePage(cookie string, perPage, page int) error {
	pageURL := fmt.Sprintf("https://toutiao.io/favorites?page=%d", page)
	res, err := httpRequest("GET", pageURL, cookie)
	if err != nil {
		return errors.Wrapf(err, "fetch favorites page %d", page)
	}
	defer httpClose(res)
	if res.StatusCode != http.StatusOK {
		// Redirects are not followed by httpRequest, so e.g. a redirect to the
		// login page caused by a stale cookie surfaces here.
		return errors.Errorf("status code error: [%d], %s", res.StatusCode, res.Status)
	}

	// Load the HTML document
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return errors.Wrapf(err, "parse favorites page %d", page)
	}
	body, err := doc.Html()
	if err != nil {
		return errors.Wrapf(err, "render favorites page %d", page)
	}
	if doc.Find(".post").Length() == 0 || strings.Contains(body, "您还没有任何收藏") {
		return errors.New("您还没有任何收藏")
	}

	// Each favorite is a div.post whose title link looks like:
	// <a target="_blank" rel="external" title="..." href="/k/cf155k">...</a>
	// Each cannot return an error, so the first failure is remembered and the
	// remaining items are skipped.
	var itemErr error
	doc.Find("div.post").Each(func(i int, s *goquery.Selection) {
		if itemErr != nil {
			return
		}
		link := s.Find("div.content h3.title a")
		title := link.Text()
		href, ok := link.Attr("href")
		if !ok || href == "" {
			// Without a href there is nothing to resolve or store.
			log.Printf("fav %d: %q has no link, skipped", i, title)
			return
		}
		postURL := "https://toutiao.io" + href

		realURL, err := resolveOrigURL(postURL, cookie)
		if err != nil {
			itemErr = errors.Wrapf(err, "resolve original url of %s", postURL)
			return
		}
		fmt.Printf("fav %d: %s - %s , realUrl: %s\n", i, title, postURL, realURL)
		if ttStarOnly {
			return
		}

		fav := Favorite{
			ID:         bson.NewObjectId(),
			Title:      title,
			ToutiaoUrl: postURL,
			OrigUrl:    realURL,
			CreatedAt:  time.Now(),
		}
		// NOTE(review): "toutiao" is hard-coded here while init logs
		// ghCollection as the destination — confirm which one is intended.
		if err := db.Insert("toutiao", fav); err != nil {
			itemErr = errors.Wrapf(err, "store favorite %s", postURL)
		}
	})
	return itemErr
}

// resolveOrigURL requests a toutiao.io short link and returns the Location
// header of a 302 answer, or "" for any other status. The response body is
// always closed before returning.
func resolveOrigURL(shortURL, cookie string) (string, error) {
	resp, err := httpRequest("GET", shortURL, cookie)
	if err != nil {
		return "", err
	}
	defer httpClose(resp)
	if resp.StatusCode != http.StatusFound {
		return "", nil
	}
	return resp.Header.Get("Location"), nil
}

// toutiaoStarredScrape walks the favorites list page by page, starting at
// ttStartPage and passing ttCookie and ttPerPage to getOnePage. It pauses
// ttSleep milliseconds after every successful page and stops at the first
// error getOnePage reports, which is logged at notice level.
func toutiaoStarredScrape() {
	for current := ttStartPage; ; current++ {
		blog.Alert("start to fetch page: %d, per_page: %d ======================>",
			current, ttPerPage)

		if fetchErr := getOnePage(ttCookie, ttPerPage, current); fetchErr != nil {
			blog.Notice("%s <====================== ", fetchErr.Error())
			return
		}

		blog.Alert("done fetch page: %d. sleep for %d ms <====================== \n\n",
			current, ttSleep)
		pause := time.Duration(ttSleep) * time.Millisecond
		time.Sleep(pause)
	}
}

// main runs the favorites export loop. Logger, configuration and the optional
// MongoDB connection are prepared by init before main starts.
func main() {
	toutiaoStarredScrape()
}
